{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# columns = ['id','loan_dt','label','tag']+['f%d'%(i+1)for i in range(6745)]\n",
    "train_path_1 = './data/open_data_train_valid/train/train_1.txt'\n",
    "# train_path_5 = './data/open_data_train_valid/train/train_5.txt'\n",
    "valid_path = './data/open_data_train_valid/valid.txt'\n",
    "valid_id_path = './data/open_data_train_valid/valid_id.txt'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 28.7 s, sys: 3.3 s, total: 32 s\n",
      "Wall time: 34.4 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "train_1 = pd.read_table(train_path_1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 32.8 s, sys: 1.2 s, total: 34 s\n",
      "Wall time: 36.4 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "valid = pd.read_table(valid_path)\n",
    "valid_id = pd.read_table(valid_id_path)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## define pipeline函数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.externals import joblib\n",
    "import autosklearn.classification\n",
    "import os\n",
    "\n",
    "def pipeline(flag, train_x, train_y, test_x, test_id):\n",
    "    '''\n",
    "    input :\n",
    "    flag(bool): 是否加载autosklearn模型\n",
    "    train_x(array)\n",
    "    train_y(array)\n",
    "    test_x(array)\n",
    "    test_id(array)\n",
    "    '''\n",
    "    if flag:\n",
    "        if not os.path.exists('automl.pkl'):\n",
    "            print('automl.pkl 不存在！')\n",
    "            return\n",
    "        automl = joblib.load('automl.pkl')\n",
    "        print('模型加载完毕！')\n",
    "        \n",
    "    else:\n",
    "        if not os.path.exists('tmp_folder'):\n",
    "            os.mkdir('tmp_folder')\n",
    "        if not os.path.exists('output_folder'):\n",
    "            os.mkdir('output_folder')\n",
    "            \n",
    "        automl = autosklearn.classification.AutoSklearnClassifier(time_left_for_this_task=18000,\n",
    "                                                                  per_run_time_limit=1800,\n",
    "                                                                  ml_memory_limit=16480,\n",
    "                                                                  delete_tmp_folder_after_terminate=False,\n",
    "                                                                  delete_output_folder_after_terminate=False,\n",
    "                                                                  shared_mode=True,\n",
    "                                                                  tmp_folder='./tmp_folder',\n",
    "                                                                  output_folder='./output_folder '\n",
    "                                                                 )\n",
    "        print('开始训练！')\n",
    "        automl.fit(train_x.values, train_y.values)\n",
    "        print('保存模型')\n",
    "        joblib.dump(automl, 'automl.pkl') # 保存模型\n",
    "        \n",
    "    print(automl.show_models())\n",
    "    pred = automl.predict_proba(test_x.drop(columns=['id']).values) # 预测\n",
    "\n",
    "    test_result = pd.DataFrame(columns=[\"id\",\"prob\"])\n",
    "    test_result.id = test_x.id\n",
    "    test_result.prob = pred[:,1]\n",
    "\n",
    "    if not os.path.exists('preds'):\n",
    "        os.mkdir('preds')\n",
    "    test_result.to_csv('./preds/auto_pred.csv',index=None) # 保存pred"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_y = train_1.label.values\n",
    "train_x = train_1.drop(columns=['id','label']).values\n",
    "test_id = valid.id.values\n",
    "test_x = valid.drop(columns=['id']).values\n",
    "\n",
    "pipeline(False, train_x, train_y, test_x, test_id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
